Cross Referencing Endowment Values
endowment_data <- read_rds("./data/endowment_filter_data_990.RDS")
companies_to_ein <- read_csv("./data/companies.csv") %>%
mutate(EIN = as.character(ein))# make kable table with consistent formatting
make_table <- function(df, title = "", ...) {
title <- paste0("<center><span style = 'font-size:150%;color:black'><b>",
title,
"</span></b><center>")
as_tibble(df) %>%
kbl(caption = title, ... ) %>%
kable_material() %>%
row_spec(row=0, background = "#43494C" , color = "white", bold = TRUE)
}plot_missing <- function(variable) {
endowment_data %>%
group_by(EIN) %>%
summarize(number_observations = sum(!is.na(!!sym(variable)))) %>%
group_by(number_observations) %>%
summarize(n=n()) %>%
ggplot(aes(x = number_observations, y =n ))+
geom_bar(stat="identity") +
labs(y = "Number of Companies",
x = paste0("Number of Observations where\n",
variable, " was Not Missing"),
title =paste0("Missingness for ", variable)) +
theme_bw() +
theme(plot.title = element_text(face = "bold", hjust = .5))
}
# note -- only works if years are consecutive
check_variable <- function(variable_name,
data) {
base_name <- variable_name
var <- paste0("CY", base_name)
vars <- paste0("CYM", c( 1:4), base_name)
# plt <- plot_missing(var)
# print(plt)
eins_with_variable <- data %>%
group_by(EIN) %>%
summarize(number_observations = sum(!is.na(!!sym(var)))) %>%
filter(number_observations != 0) %>%
pull(EIN)
# the goal here is to create a row for each fiscal year, with NAs if
# there are no observations for that year
# this is needed so that we have consecutive years, which is important
# for substraction using lag() to work correctly
data <- data %>%
filter(EIN %in% eins_with_variable) %>%
select(EIN, fiscal_year, contains(base_name)) %>%
pivot_wider(names_from = fiscal_year,
# names_prefix = "fiscalyear",
values_from=contains(base_name)) %>%
pivot_longer(cols = contains(base_name),
names_to = "variable_year") %>%
separate(variable_year, sep = "_", into = c("variable_name", "fiscal_year")) %>%
pivot_wider(names_from = variable_name, values_from = value) %>%
mutate(fiscal_year = as.factor(as.numeric(fiscal_year)))
crossref <- data %>%
group_by(EIN) %>%
arrange(fiscal_year) %>%
# lag corresponds to how far back the current year comparison should be
# vars contains the CM* variables that represent reporting for years back
# compare these CM* variables to the lagged current year variables
mutate(
difference_in_reported_year1 = !!sym(vars[1]) -
lag(!!sym(var), n =1),
difference_in_reported_year2 = !!sym(vars[2]) -
lag(!!sym(var), n =2),
difference_in_reported_year3 = !!sym(vars[3]) -
lag(!!sym(var), n =3),
difference_in_reported_year4 = !!sym(vars[4]) -
lag(!!sym(var), n =4)
) %>%
ungroup()
}Cross Referencing Beginning Year Balance Amount
Comparison Across Years
As we might expect, we see that a higher proportion had a nonzero difference between the cross referenced reports for years further back in time. That is, reporting tended to be more accurate for most recent years.
crossref <- check_variable("BeginningYearBalanceAmt", data = endowment_data)
plot_missing("CYBeginningYearBalanceAmt")# stacked chart, note we can't see how nonzero counts are changing
# relative to the total counts
crossref %>%
select(EIN, contains("difference"), fiscal_year) %>%
pivot_longer(cols = contains("difference")) %>%
filter(!is.na(value)) %>%
group_by(name) %>%
summarize(zero = sum(ifelse(value == 0, 1,0)),
nonzero = sum(ifelse(value == 0, 0,1))) %>%
# notice each row represents a fiscal_year-EIN-difference_type
pivot_longer(cols = c(zero, nonzero),
names_to = "source",
values_to = "count") %>%
mutate(name = gsub("difference_in_reported_year", "", name),
source = ifelse(source == "nonzero",
"Nonzero Difference",
"Zero Difference")) %>%
ggplot(aes(x=name, y = count, fill = source)) +
geom_bar(stat ="identity", position = "stack", alpha = .8) +
geom_label(aes(label = round(count,2), y = count, color = source),
position = "stack",
size = 2.6,
label.padding = unit(.1, "lines"),
fill = "white",
fontface="bold",
show.legend = FALSE) +
labs(title = "Number of Zero and Nonzero Differences\nBetween Cross Referenced Reports",
subtitle = "By Year",
x = "Years Between Reports Compared",
y = "Count",
fill = "") +
theme_bw() +
theme(plot.title = element_text(size = 16, hjust = .5, face="bold"),
plot.subtitle = element_text(hjust = .5, face="italic"),
axis.text.x = element_text(size = 13),
axis.title = element_text(size = 16, face = "bold"))# plot fraction where there was a difference between
# the reports by year
crossref %>%
select(EIN, contains("difference")) %>%
pivot_longer(cols = contains("difference")) %>%
filter(!is.na(value)) %>%
group_by(name) %>%
summarize(number_zeros = sum(ifelse(value == 0, 1,0)),
total_reports = n(),
fraction = 1-( number_zeros / total_reports)) %>%
mutate(name = gsub("difference_in_reported_year", "", name)) %>%
ggplot(aes(x=name, y = fraction)) +
geom_bar(stat ="identity", fill = "#234A77") +
geom_label(aes(label = round(fraction,2))) +
labs(title = paste0("Fraction of Differences that Were Nonzero\n",
"Between Cross Referenced Reports"),
subtitle = "By Year",
x = "Years Between Reports Compared",
y = "Fraction with Nonzero Difference") +
theme_bw() +
theme(plot.title = element_text(hjust = .5, face="bold"),
plot.subtitle = element_text(hjust = .5, face="italic"))We also see we have fewer total counts of non-missing reports as we go further back in time.
# plot counts
crossref %>%
select(EIN, contains("difference")) %>%
pivot_longer(cols = contains("difference")) %>%
filter(!is.na(value)) %>%
group_by(name) %>%
summarize(zero = sum(ifelse(value == 0, 1,0)),
nonzero = sum(ifelse(value == 0, 0,1))) %>%
pivot_longer(cols = c(zero, nonzero),
names_to = "source",
values_to = "count") %>%
mutate(name = gsub("difference_in_reported_year", "", name)) %>%
ggplot(aes(x=name, y = count, fill = source)) +
geom_bar(stat ="identity", position = "stack") +
geom_label(aes(label = round(count,2), y = count),
position = "stack",
size = 2.6,
label.padding = unit(.1, "lines"),
show.legend = FALSE) +
labs(title = "EINs with Zero and Nonzero Difference\nBetween Cross Referenced Reports",
subtitle = "By Year",
x = "Years Between Reports Compared",
y = "Count") +
theme_bw() +
theme(plot.title = element_text(size = 16, hjust = .5, face="bold"),
plot.subtitle = element_text(hjust = .5, face="italic"),
axis.text.x = element_text(size = 13),
axis.title = element_text(size = 16, face = "bold"))Companies with Discordance in Reported Values
# difference represents What They Reported as CY Minus X Years - What They Reported at The Time
companies_different <- crossref %>%
pivot_longer(cols = contains("difference")) %>%
select(EIN, fiscal_year, name, value) %>%
filter(value > 0) %>%
left_join(companies_to_ein, by = c("EIN" = "EIN")) %>%
arrange(organization_name) %>%
pull(EIN) %>%
unique()
crossref %>%
pivot_longer(cols = contains("difference")) %>%
select(EIN, fiscal_year, name, value) %>%
filter(value > 0) %>%
left_join(companies_to_ein, by = c("EIN" = "EIN")) %>%
mutate(year = substr(name, nchar(name), nchar(name)),
year = paste0("Comparing Current<br> Year Minus ",
year)) %>%
arrange(organization_name) %>%
select(`Organization Name` = organization_name,
`Difference in Years` = year,
`Fiscal Year` = fiscal_year,
`Recent - Previously Reported` = value) %>%
make_table(title = paste0(
"Comparing Values Reported in More Recent Report to Those Previously Reported:<br>",
"<i>Number of Companies that have at Least One Report Not Concordant: </i>",
length(companies_different)),
digits = 3,
format.args = list(
big.mark = ",",
scientific = FALSE),
escape=FALSE,
booktabs=TRUE) %>%
scroll_box(height = "450px",
width = "100%") | Organization Name | Difference in Years | Fiscal Year | Recent - Previously Reported |
|---|---|---|---|
| Ballet Arizona |
Comparing Current Year Minus 1 |
2018 | 4,025,025 |
| Ballet Arizona |
Comparing Current Year Minus 2 |
2018 | 500,000 |
| Ballet Arizona |
Comparing Current Year Minus 2 |
2019 | 4,025,025 |
| Ballet Arizona |
Comparing Current Year Minus 3 |
2019 | 500,000 |
| Ballet Arizona |
Comparing Current Year Minus 3 |
2020 | 4,025,025 |
| Ballet Arizona |
Comparing Current Year Minus 4 |
2020 | 500,000 |
| Fort Wayne Ballet |
Comparing Current Year Minus 1 |
2018 | 26,128 |
| Fort Wayne Ballet |
Comparing Current Year Minus 1 |
2019 | 13,343 |
| Fort Wayne Ballet |
Comparing Current Year Minus 2 |
2019 | 148,799 |
| Fort Wayne Ballet |
Comparing Current Year Minus 2 |
2020 | 13,343 |
| Fort Wayne Ballet |
Comparing Current Year Minus 3 |
2020 | 148,799 |
| Pacific Northwest Ballet |
Comparing Current Year Minus 1 |
2019 | 3,000 |
| Pacific Northwest Ballet |
Comparing Current Year Minus 2 |
2020 | 3,000 |
| San Francisco Ballet |
Comparing Current Year Minus 1 |
2017 | 107,033,401 |
| San Francisco Ballet |
Comparing Current Year Minus 2 |
2017 | 105,867,772 |
| San Francisco Ballet |
Comparing Current Year Minus 2 |
2018 | 107,033,401 |
| San Francisco Ballet |
Comparing Current Year Minus 3 |
2018 | 105,867,772 |
| San Francisco Ballet |
Comparing Current Year Minus 3 |
2019 | 107,033,401 |
| San Francisco Ballet |
Comparing Current Year Minus 4 |
2019 | 105,867,772 |
| San Francisco Ballet |
Comparing Current Year Minus 4 |
2020 | 107,033,401 |
| The Alabama Ballet |
Comparing Current Year Minus 1 |
2019 | 227,040 |
| The Alabama Ballet |
Comparing Current Year Minus 2 |
2020 | 227,040 |
| The Alabama Ballet |
Comparing Current Year Minus 3 |
2020 | 219,787 |
| The Alabama Ballet |
Comparing Current Year Minus 4 |
2020 | 254,152 |
We see that values are repeated because if there is some value that is quite off, say for 2016, then this shows up in the CYM1 for 2017, but also CYM2 for 2018, CYM3 for 2019 and so on.
Tables of Reported Values for Each Company with Discordance in Reported Values
Observations:
- We see in some cases, the problematic reports are clear initially. This is the case in San Francisco Ballet, Ballet Arizona, or the Alabama Ballet.
- The differences for Fort Wayne Ballet and the Pacific Northwest Ballet are more subtle.
# iterate through EINs where there was discordance and
# generate a table so we can better see what's going on
variable_name <- "BeginningYearBalanceAmt"
walk(1:length(companies_different), ~{
name <- companies_to_ein %>%
filter(EIN == companies_different[.x]) %>%
pull(organization_name)
table <- crossref %>%
rename_with(cols=everything(), ~gsub(variable_name, "", .)) %>%
filter(EIN %in% companies_different[.x]) %>%
select(-c(EIN, contains("difference"))) %>%
make_table(title = paste0("Reports for ",
name, "<br>EIN: ",
companies_different[.x],
", Variable: ", variable_name))
print(table)
# print(table)
})| fiscal_year | CY | CYM1 | CYM2 | CYM3 | CYM4 |
|---|---|---|---|---|---|
| 2014 | NA | NA | NA | NA | NA |
| 2015 | 101399 | 100399 | 100399 | 100399 | 100399 |
| 2016 | 101399 | 101399 | 100399 | 100399 | 100399 |
| 2017 | 101399 | 101399 | 101399 | 100399 | 100399 |
| 2018 | 4746867 | 4126424 | 601399 | NA | NA |
| 2019 | 4732822 | 4746867 | 4126424 | 601399 | NA |
| 2020 | 4670922 | 4732822 | 4746867 | 4126424 | 601399 |
| 2021 | NA | NA | NA | NA | NA |
| fiscal_year | CY | CYM1 | CYM2 | CYM3 | CYM4 |
|---|---|---|---|---|---|
| 2014 | NA | NA | NA | NA | NA |
| 2015 | NA | NA | NA | NA | NA |
| 2016 | NA | NA | NA | NA | NA |
| 2017 | 1264981 | 1191922 | 1174150 | 36538 | 36538 |
| 2018 | 1413780 | 1291109 | 1219104 | 1201082 | 60137 |
| 2019 | 1415612 | 1427123 | 1413780 | 1291109 | 1219104 |
| 2020 | 1422619 | 1415612 | 1427123 | 1413780 | 1291109 |
| 2021 | NA | NA | NA | NA | NA |
| fiscal_year | CY | CYM1 | CYM2 | CYM3 | CYM4 |
|---|---|---|---|---|---|
| 2014 | NA | NA | NA | NA | NA |
| 2015 | 16919887 | 15702692 | 14297377 | 14671226 | 12528462 |
| 2016 | 17026097 | 16919887 | 15702692 | 14297377 | 14671226 |
| 2017 | 15778987 | 17026097 | 16919887 | 15702692 | 14297377 |
| 2018 | 18145008 | 15778987 | 17026097 | 16919887 | 15702692 |
| 2019 | 19254508 | 18148008 | 15778987 | 17026097 | 16919887 |
| 2020 | 19741644 | 19254508 | 18148008 | 15778987 | 17026097 |
| 2021 | NA | NA | NA | NA | NA |
| fiscal_year | CY | CYM1 | CYM2 | CYM3 | CYM4 |
|---|---|---|---|---|---|
| 2014 | NA | NA | NA | NA | NA |
| 2015 | 174 | 174 | 174 | 1035814 | 2318646 |
| 2016 | 174 | 174 | 174 | 174 | 1035814 |
| 2017 | 100219009 | 107033575 | 105867946 | 92513161 | 79137681 |
| 2018 | 119047942 | 100219009 | 107033575 | 105867946 | 92513161 |
| 2019 | 125015507 | 119047942 | 100219009 | 107033575 | 105867946 |
| 2020 | 124452193 | 125015507 | 119047942 | 100219009 | 107033575 |
| 2021 | NA | NA | NA | NA | NA |
| fiscal_year | CY | CYM1 | CYM2 | CYM3 | CYM4 |
|---|---|---|---|---|---|
| 2014 | NA | NA | NA | NA | NA |
| 2015 | NA | NA | NA | NA | NA |
| 2016 | 250000 | 250000 | 250000 | 250000 | 250000 |
| 2017 | 250000 | 250000 | 250000 | 250000 | 250000 |
| 2018 | 250000 | 250000 | 250000 | 250000 | 250000 |
| 2019 | 446591 | 477040 | 250000 | 250000 | 250000 |
| 2020 | 430113 | 446591 | 477040 | 469787 | 504152 |
| 2021 | NA | NA | NA | NA | NA |
crossref %>%
pivot_longer(cols = contains("difference")) %>%
select(EIN, fiscal_year, name, value) %>%
# filter(value > 0) %>%
left_join(companies_to_ein, by = c("EIN" = "EIN")) %>%
mutate(year = substr(name, nchar(name), nchar(name)),
year = paste0("Comparing Current Year Minus ",
year)) %>%
arrange(organization_name) %>% View()Cross Referencing All Endowment Variables
Missingness by Variable
variables_to_check <- endowment_data %>%
select(contains("CY")) %>%
colnames() %>%
gsub("CY|CYM.", "",.) %>%
unique()
crossref_all <- map_df(
variables_to_check,
~{ variable_name <- .x
check_variable(variable_name,
data = endowment_data) %>%
# remove variable name part of column name
# so we can bind rows together, add this information
# as a separate column
rename_with(cols=everything(),
~gsub(variable_name, "", .)) %>%
mutate(variable = .x)
})
missing_all <- map_df( variables_to_check,
~ {variable <- paste0("CY",.x)
endowment_data %>%
group_by(EIN) %>%
summarize(number_observations = sum(!is.na(!!sym(variable)))) %>%
group_by(number_observations) %>%
summarize(number_eins=n()) %>%
mutate(variable = variable)
})colors <- c("#58b5e1", "#49406e", "#9dd84e", "#6633b4", "#46ebdc")
missing_all %>%
mutate(number_observations = paste0(
"Number of EINS with ",
number_observations,
" Observations for this Variable" )) %>%
ggplot(aes(x = variable, y =number_eins, fill = variable))+
geom_bar(stat="identity",
position = "dodge",
show.legend=FALSE) +
geom_label(aes(label = number_eins,
color = variable),
fill = "white",
vjust = .5,
size = 2,
position = position_dodge(1),
label.padding = unit(.1, "lines"),
show.legend=FALSE) +
facet_wrap(~number_observations, ncol=1) +
coord_flip() +
labs(y = "Number of Companies",
x = "Variable Name",
title = "Comparing Missingness Across Variables") +
theme_bw() +
theme(plot.title = element_text(face = "bold", hjust = .5),
axis.title = element_text(face = "bold")) +
scale_fill_manual(values = colors) +
scale_color_manual(values = colors) +
scale_y_continuous(n.breaks = 8) Fraction Discordant by Variable
# plot fraction discordant for each variable
crossref_all %>%
select(EIN, contains("difference"), variable) %>%
pivot_longer( contains("difference")) %>%
filter(!is.na(value)) %>%
group_by(variable) %>%
summarize(number_of_discordant_observations = sum(value > 1),
total_observations_of_variable = n(),
fraction_discordant = number_of_discordant_observations / total_observations_of_variable) %>%
ggplot(aes(x = fct_reorder(variable,
fraction_discordant,
.desc = TRUE),
y = fraction_discordant)) +
geom_bar(stat="identity",
fill = "#234A77")+
geom_label(aes(label = round(fraction_discordant, 3))) +
theme_bw() +
theme(plot.title = element_text(face = "bold", hjust = .5, size = 16),
axis.title = element_text(face = "bold", size =16),
axis.text.x = element_text(size = 12, angle = 10, vjust = .6)) +
labs(y = "Fraction Discordant",
x = "Endowment Variable",
title = "Fraction of Observations that Were Discordant for Each Variable")get_discordant_table <- function(variable_name, data) {
cross_ref_for_var <- data %>%
filter(variable == variable_name) %>%
pivot_longer(cols = contains("difference")) %>%
select(EIN, fiscal_year, name, value) %>%
filter(value > 0)
discordant <- cross_ref_for_var %>%
pull(EIN) %>% unique()
cross_ref_for_var %>%
left_join(companies_to_ein, by = c("EIN" = "EIN")) %>%
mutate(year = substr(name, nchar(name), nchar(name)),
year = paste0("Comparing Current<br> Year Minus ",
year)) %>%
arrange(organization_name) %>%
select(`Organization Name` = organization_name,
`Difference in Years` = year,
`Fiscal Year` = fiscal_year,
`Recent - Previously Reported` = value) %>%
make_table(title = paste0("Variable: ",
variable_name,
"<br>Comparing Values Reported in More Recent Report to Those Previously Reported:<br>",
"<i>Number of Companies that have at Least One Report Not Concordant: </i>",
length(discordant)),
digits = 3,
format.args = list(
big.mark = ",",
scientific = FALSE),
escape=FALSE,
booktabs=TRUE) %>%
scroll_box(height = "450px",
width = "100%")
}
walk(variables_to_check, ~{
table_for_var <- get_discordant_table(.x, data = crossref_all)
print(table_for_var)
})| Organization Name | Difference in Years | Fiscal Year | Recent - Previously Reported |
|---|---|---|---|
| Ballet Arizona |
Comparing Current Year Minus 1 |
2018 | 4,025,025 |
| Ballet Arizona |
Comparing Current Year Minus 2 |
2018 | 500,000 |
| Ballet Arizona |
Comparing Current Year Minus 2 |
2019 | 4,025,025 |
| Ballet Arizona |
Comparing Current Year Minus 3 |
2019 | 500,000 |
| Ballet Arizona |
Comparing Current Year Minus 3 |
2020 | 4,025,025 |
| Ballet Arizona |
Comparing Current Year Minus 4 |
2020 | 500,000 |
| Fort Wayne Ballet |
Comparing Current Year Minus 1 |
2018 | 26,128 |
| Fort Wayne Ballet |
Comparing Current Year Minus 1 |
2019 | 13,343 |
| Fort Wayne Ballet |
Comparing Current Year Minus 2 |
2019 | 148,799 |
| Fort Wayne Ballet |
Comparing Current Year Minus 2 |
2020 | 13,343 |
| Fort Wayne Ballet |
Comparing Current Year Minus 3 |
2020 | 148,799 |
| Pacific Northwest Ballet |
Comparing Current Year Minus 1 |
2019 | 3,000 |
| Pacific Northwest Ballet |
Comparing Current Year Minus 2 |
2020 | 3,000 |
| San Francisco Ballet |
Comparing Current Year Minus 1 |
2017 | 107,033,401 |
| San Francisco Ballet |
Comparing Current Year Minus 2 |
2017 | 105,867,772 |
| San Francisco Ballet |
Comparing Current Year Minus 2 |
2018 | 107,033,401 |
| San Francisco Ballet |
Comparing Current Year Minus 3 |
2018 | 105,867,772 |
| San Francisco Ballet |
Comparing Current Year Minus 3 |
2019 | 107,033,401 |
| San Francisco Ballet |
Comparing Current Year Minus 4 |
2019 | 105,867,772 |
| San Francisco Ballet |
Comparing Current Year Minus 4 |
2020 | 107,033,401 |
| The Alabama Ballet |
Comparing Current Year Minus 1 |
2019 | 227,040 |
| The Alabama Ballet |
Comparing Current Year Minus 2 |
2020 | 227,040 |
| The Alabama Ballet |
Comparing Current Year Minus 3 |
2020 | 219,787 |
| The Alabama Ballet |
Comparing Current Year Minus 4 |
2020 | 254,152 |
| Organization Name | Difference in Years | Fiscal Year | Recent - Previously Reported |
|---|---|---|---|
| Joffrey Ballet |
Comparing Current Year Minus 1 |
2017 | 41,702 |
| Joffrey Ballet |
Comparing Current Year Minus 2 |
2018 | 41,702 |
| Joffrey Ballet |
Comparing Current Year Minus 3 |
2019 | 41,702 |
| Joffrey Ballet |
Comparing Current Year Minus 4 |
2020 | 41,702 |
| San Francisco Ballet |
Comparing Current Year Minus 1 |
2017 | 3,440,416 |
| San Francisco Ballet |
Comparing Current Year Minus 2 |
2017 | 6,457,496 |
| San Francisco Ballet |
Comparing Current Year Minus 2 |
2018 | 3,440,416 |
| San Francisco Ballet |
Comparing Current Year Minus 3 |
2018 | 6,457,496 |
| San Francisco Ballet |
Comparing Current Year Minus 1 |
2019 | 1,025,273 |
| San Francisco Ballet |
Comparing Current Year Minus 2 |
2019 | 2,479,840 |
| San Francisco Ballet |
Comparing Current Year Minus 3 |
2019 | 3,440,416 |
| San Francisco Ballet |
Comparing Current Year Minus 4 |
2019 | 6,457,496 |
| San Francisco Ballet |
Comparing Current Year Minus 2 |
2020 | 1,025,273 |
| San Francisco Ballet |
Comparing Current Year Minus 3 |
2020 | 2,479,840 |
| San Francisco Ballet |
Comparing Current Year Minus 4 |
2020 | 3,440,416 |
| Organization Name | Difference in Years | Fiscal Year | Recent - Previously Reported |
|---|---|---|---|
| Fort Wayne Ballet |
Comparing Current Year Minus 1 |
2018 | 2,568 |
| Pittsburgh Ballet Theatre |
Comparing Current Year Minus 1 |
2019 | 80,765 |
| Pittsburgh Ballet Theatre |
Comparing Current Year Minus 2 |
2020 | 80,765 |
| Pittsburgh Ballet Theatre |
Comparing Current Year Minus 1 |
2021 | 25,082 |
| Pittsburgh Ballet Theatre |
Comparing Current Year Minus 3 |
2021 | 80,765 |
| San Francisco Ballet |
Comparing Current Year Minus 2 |
2017 | 1,133,639 |
| San Francisco Ballet |
Comparing Current Year Minus 3 |
2018 | 1,133,639 |
| San Francisco Ballet |
Comparing Current Year Minus 4 |
2019 | 1,133,639 |
| The Alabama Ballet |
Comparing Current Year Minus 3 |
2020 | 30,742 |
| Organization Name | Difference in Years | Fiscal Year | Recent - Previously Reported |
|---|---|---|---|
| Fort Wayne Ballet |
Comparing Current Year Minus 1 |
2019 | 3,149 |
| Fort Wayne Ballet |
Comparing Current Year Minus 2 |
2020 | 3,149 |
| Pittsburgh Ballet Theatre |
Comparing Current Year Minus 1 |
2020 | 300 |
| Pittsburgh Ballet Theatre |
Comparing Current Year Minus 2 |
2021 | 300 |
| San Francisco Ballet |
Comparing Current Year Minus 1 |
2017 | 5,795,836 |
| San Francisco Ballet |
Comparing Current Year Minus 2 |
2017 | 5,498,836 |
| San Francisco Ballet |
Comparing Current Year Minus 2 |
2018 | 5,795,836 |
| San Francisco Ballet |
Comparing Current Year Minus 3 |
2018 | 5,498,836 |
| San Francisco Ballet |
Comparing Current Year Minus 3 |
2019 | 5,795,836 |
| San Francisco Ballet |
Comparing Current Year Minus 4 |
2019 | 5,498,836 |
| San Francisco Ballet |
Comparing Current Year Minus 4 |
2020 | 5,795,836 |
| The Alabama Ballet |
Comparing Current Year Minus 3 |
2020 | 23,489 |
| The Alabama Ballet |
Comparing Current Year Minus 4 |
2020 | 25,201 |
| Organization Name | Difference in Years | Fiscal Year | Recent - Previously Reported |
|---|---|---|---|
| Ballet Arizona |
Comparing Current Year Minus 1 |
2018 | 973,030 |
| Ballet Arizona |
Comparing Current Year Minus 2 |
2018 | 4,025,025 |
| Ballet Arizona |
Comparing Current Year Minus 3 |
2018 | 500,000 |
| Ballet Arizona |
Comparing Current Year Minus 2 |
2019 | 973,030 |
| Ballet Arizona |
Comparing Current Year Minus 3 |
2019 | 4,025,025 |
| Ballet Arizona |
Comparing Current Year Minus 4 |
2019 | 500,000 |
| Ballet Arizona |
Comparing Current Year Minus 3 |
2020 | 973,030 |
| Ballet Arizona |
Comparing Current Year Minus 4 |
2020 | 4,025,025 |
| Fort Wayne Ballet |
Comparing Current Year Minus 1 |
2018 | 38,699 |
| Fort Wayne Ballet |
Comparing Current Year Minus 2 |
2019 | 52,042 |
| Fort Wayne Ballet |
Comparing Current Year Minus 3 |
2020 | 52,042 |
| Joffrey Ballet |
Comparing Current Year Minus 1 |
2017 | 41,702 |
| Joffrey Ballet |
Comparing Current Year Minus 2 |
2018 | 41,702 |
| Joffrey Ballet |
Comparing Current Year Minus 3 |
2019 | 41,702 |
| Joffrey Ballet |
Comparing Current Year Minus 4 |
2020 | 41,702 |
| Pacific Northwest Ballet |
Comparing Current Year Minus 1 |
2019 | 3,000 |
| Pacific Northwest Ballet |
Comparing Current Year Minus 2 |
2020 | 3,000 |
| Pittsburgh Ballet Theatre |
Comparing Current Year Minus 1 |
2019 | 80,765 |
| Pittsburgh Ballet Theatre |
Comparing Current Year Minus 2 |
2020 | 80,765 |
| Pittsburgh Ballet Theatre |
Comparing Current Year Minus 1 |
2021 | 25,082 |
| Pittsburgh Ballet Theatre |
Comparing Current Year Minus 3 |
2021 | 80,765 |
| San Francisco Ballet |
Comparing Current Year Minus 1 |
2017 | 100,218,835 |
| San Francisco Ballet |
Comparing Current Year Minus 2 |
2017 | 107,033,401 |
| San Francisco Ballet |
Comparing Current Year Minus 2 |
2018 | 100,218,835 |
| San Francisco Ballet |
Comparing Current Year Minus 3 |
2018 | 107,033,401 |
| San Francisco Ballet |
Comparing Current Year Minus 3 |
2019 | 100,218,835 |
| San Francisco Ballet |
Comparing Current Year Minus 4 |
2019 | 107,033,401 |
| San Francisco Ballet |
Comparing Current Year Minus 4 |
2020 | 100,218,835 |
| Texas Ballet Theater |
Comparing Current Year Minus 1 |
2016 | 143 |
| Texas Ballet Theater |
Comparing Current Year Minus 2 |
2017 | 143 |
| Texas Ballet Theater |
Comparing Current Year Minus 3 |
2018 | 143 |
| Texas Ballet Theater |
Comparing Current Year Minus 4 |
2019 | 143 |
| The Alabama Ballet |
Comparing Current Year Minus 1 |
2019 | 196,591 |
| The Alabama Ballet |
Comparing Current Year Minus 2 |
2020 | 196,591 |
| The Alabama Ballet |
Comparing Current Year Minus 3 |
2020 | 227,040 |
| The Alabama Ballet |
Comparing Current Year Minus 4 |
2020 | 219,787 |
Companies with Discordant Reporting for at Least One Variable
intersections <- crossref_all %>%
pivot_longer(cols = contains("difference")) %>%
select(EIN, fiscal_year, name, value, variable) %>%
filter(value > 0) %>%
group_by(variable) %>%
summarize(EINs = list(unique(EIN)))
crossref_all %>%
pivot_longer(cols = contains("difference")) %>%
select(EIN, fiscal_year, name, value, variable) %>%
filter(value > 0) %>%
group_by(EIN) %>%
summarize(variable = paste(unique(variable), collapse=",<br>")) %>%
left_join(companies_to_ein) %>%
arrange(organization_name) %>%
select(`Organization Name` = `organization_name`,
`Variables with Discordant Reporting` = variable) %>%
make_table(
title = "Companies with Discordant Reporting for at Least One Variable",
escape=FALSE)| Organization Name | Variables with Discordant Reporting |
|---|---|
| Ballet Arizona |
BeginningYearBalanceAmt, EndYearBalanceAmt |
| Fort Wayne Ballet |
BeginningYearBalanceAmt, InvestmentEarningsOrLossesAmt, OtherExpendituresAmt, EndYearBalanceAmt |
| Joffrey Ballet |
ContributionsAmt, EndYearBalanceAmt |
| Pacific Northwest Ballet |
BeginningYearBalanceAmt, EndYearBalanceAmt |
| Pittsburgh Ballet Theatre |
InvestmentEarningsOrLossesAmt, OtherExpendituresAmt, EndYearBalanceAmt |
| San Francisco Ballet |
BeginningYearBalanceAmt, ContributionsAmt, InvestmentEarningsOrLossesAmt, OtherExpendituresAmt, EndYearBalanceAmt |
| Texas Ballet Theater | EndYearBalanceAmt |
| The Alabama Ballet |
BeginningYearBalanceAmt, InvestmentEarningsOrLossesAmt, OtherExpendituresAmt, EndYearBalanceAmt |
Reduce(intersect, intersections$EINs) %>% length() ## [1] 1
Reduce(union, intersections$EINs) %>% length() ## [1] 8
variable_name <- "BeginningYearBalanceAmt"
plot_reported_for_variable <- function(variable_name, crossref, endowment) {
cross_ref_for_var <- crossref %>%
filter(variable == variable_name) %>%
pivot_longer(cols = contains("difference")) %>%
select(EIN, fiscal_year, name, value) %>%
filter(value > 0)
discordant <- cross_ref_for_var %>%
pull(EIN) %>% unique()
number_cols <- ifelse(length(discordant) <= 6, 1,2)
# plot the values for the year they correspond to so we can compare,
# for example, if CM1 for 2016 is the same as CY for 2015
endowment %>%
filter(EIN %in% discordant) %>%
select(EIN, fiscal_year, contains(variable_name)) %>%
group_by(EIN) %>%
arrange(fiscal_year) %>%
pivot_longer(3: ncol(.)) %>%
mutate(source = ifelse(grepl("CYM", name), substr(name, 1,4), "CY"),
year_lag = ifelse(grepl("CYM", name), substr(source, 4,4), 0),
year_lag = as.numeric(year_lag),
fiscal_year = as.integer(paste0(fiscal_year))) %>%
mutate(value_year = fiscal_year -year_lag
) %>%
left_join(companies_to_ein) %>%
mutate(organization_name = paste0(organization_name,
" (EIN: ", EIN, ")")) %>%
ggplot(aes(x = value_year, y = value)) +
geom_jitter(aes(fill=source), height =0,
width = .2,
alpha = .8,
size = 2.2,
shape =21,
color = "black",
stroke =.4) +
# geom_line(aes(group = source, color = source)) +
facet_wrap(~organization_name, scales= 'free_y', ncol = number_cols) +
scale_x_continuous(breaks = 2011:2021 ) +
scale_y_continuous(labels = comma) +
viridis::scale_fill_viridis(option="magma", discrete=TRUE) +
theme_bw() +
labs(x = "Fiscal Year",
y = "Reported Value (Dollars)",
title = paste0("Comparing Reported Values for ", variable_name),
subtitle = "Only Considering Companies with at Least One Discordant Value") +
theme(plot.title = element_text(
face = "bold",
hjust = .5,
size = 16),
axis.title = element_text(face = "bold", size =16),
axis.text = element_text(size = 12),
strip.text = element_text(face = "bold", size = 14),
plot.subtitle=element_text(size =14,
face="italic",
hjust = .5),
legend.text = element_text(size = 10),
legend.title = element_text(face = "bold", size = 12)) +
guides(legend = guide_legend(override.aes = list(size = 3)))
}
walk(unique(variables_to_check),
~ {plt <- plot_reported_for_variable(
variable_name = .x,
crossref = crossref_all,
endowment = endowment_data)
print(plt) })crossref %>%
select(EIN, contains("difference")) %>%
pivot_longer(cols = contains("difference")) %>%
group_by(name) %>%
mutate(count_na = sum(is.na(value)),
count_not_na = sum(!is.na(value))) %>%
ungroup() %>%
ggplot(aes(x = value)) +
# geom_boxplot() +
geom_histogram(bins = 50)
crossref %>%
select(EIN, contains("difference")) %>%
pivot_longer(cols = contains("difference")) %>%
group_by(name) %>%
mutate(count_na = sum(is.na(value)),
count_not_na = sum(!is.na(value))) %>%
ungroup() %>%
ggplot(aes(x = name, y=value)) +
geom_boxplot() +
geom_jitter(alpha = .5, height = 0, width = .01)
crossref %>%
select(EIN, contains("difference")) %>%
pivot_longer(cols = contains("difference")) %>%
group_by(name) %>%
mutate(count_na = sum(is.na(value)),
count_not_na = sum(!is.na(value))) %>%
ungroup() %>%
ggplot(aes(x = name, y = value)) +
# geom_boxplot() +
geom_density()
geom_histogram() +
facet_wrap(~name) +
scale_x_log10()